# Number of distinct universities per country, computed on the Spark side
# and then pulled into local memory.
universities_per_country <- works %>%
  distinct(country, University) %>%
  group_by(country) %>%
  summarise(n_universities = n()) %>%
  arrange(desc(n_universities)) %>%
  collect()
# Render the per-country university counts as a markdown table.
knitr::kable(universities_per_country)
| country | n_universities |
|---|---|
| China | 215 |
| United States | 199 |
| United Kingdom | 60 |
| Japan | 55 |
| Germany | 54 |
| South Korea | 46 |
| Spain | 42 |
| Italy | 41 |
| India | 38 |
| Iran | 36 |
| Australia | 32 |
| Turkey | 31 |
| Poland | 31 |
| Brazil | 31 |
| Canada | 30 |
| France | 28 |
| Taiwan | 21 |
| Netherlands | 13 |
| Austria | 12 |
| Sweden | 12 |
| Russia | 10 |
| South Africa | 9 |
| Switzerland | 8 |
| Egypt | 8 |
| Israel | 8 |
| Greece | 8 |
| Belgium | 8 |
| Finland | 7 |
| New Zealand | 7 |
| Czech Republic | 7 |
| Malaysia | 6 |
| Thailand | 6 |
| Norway | 6 |
| Hungary | 6 |
| Ireland | 6 |
| Mexico | 6 |
| Portugal | 6 |
| Pakistan | 5 |
| Saudi Arabia | 5 |
| Denmark | 5 |
| Singapore | 3 |
| Romania | 3 |
| Colombia | 3 |
| Argentina | 3 |
| Serbia | 3 |
| Chile | 3 |
| Tunisia | 3 |
| Jordan | 2 |
| Slovenia | 2 |
| Slovakia | 2 |
| Nigeria | 2 |
| United Arab Emirates | 2 |
| Algeria | 1 |
| Croatia | 1 |
| Iceland | 1 |
| Estonia | 1 |
| Viet Nam | 1 |
| Ghana | 1 |
| Uruguay | 1 |
| Kuwait | 1 |
| Luxembourg | 1 |
| Lithuania | 1 |
| Morocco | 1 |
| Oman | 1 |
| Uganda | 1 |
| Qatar | 1 |
| Ethiopia | 1 |
| Cyprus | 1 |
| Lebanon | 1 |
# papers per country: fractional publication counts per country
papers_per_country <- works %>%
  # drop duplicate rows introduced by the one-row-per-concept layout
  distinct(country, country_code, id, work_frac, author_position, institution_id) %>%
  group_by(country, country_code) %>%
  # round(sum(...)) instead of piping inside summarise(); `.groups = "drop"`
  # silences the "`summarise()` has grouped output" message and avoids
  # accidentally grouped operations downstream
  summarise(sum_fractional_works = round(sum(work_frac), digits = 1),
            .groups = "drop") %>%
  arrange(desc(sum_fractional_works)) %>%
  collect()
## `summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
# Show the fractional paper counts, omitting the ISO country-code column.
papers_per_country %>%
  select(!country_code) %>%
  knitr::kable()
| country | sum_fractional_works |
|---|---|
| United States | 132628.9 |
| Brazil | 103447.9 |
| China | 97179.9 |
| United Kingdom | 38102.9 |
| Spain | 34201.0 |
| Germany | 30103.2 |
| Canada | 27258.0 |
| Australia | 21986.6 |
| South Korea | 20156.7 |
| Japan | 20089.4 |
| Italy | 18318.3 |
| Poland | 14124.0 |
| Netherlands | 11053.1 |
| South Africa | 10701.6 |
| Sweden | 10646.7 |
| Taiwan | 9195.1 |
| India | 8771.8 |
| Iran | 8511.9 |
| France | 7881.0 |
| Switzerland | 7804.5 |
| Mexico | 6637.2 |
| Turkey | 6554.0 |
| Belgium | 6295.9 |
| Portugal | 5963.7 |
| Denmark | 5321.0 |
| Malaysia | 5107.1 |
| Israel | 4829.3 |
| Norway | 4701.5 |
| Austria | 4549.8 |
| Colombia | 4403.0 |
| Russia | 4289.4 |
| Argentina | 4039.5 |
| Saudi Arabia | 4037.6 |
| Chile | 3713.8 |
| Finland | 3449.6 |
| Egypt | 3348.2 |
| New Zealand | 2945.4 |
| Singapore | 2847.1 |
| Thailand | 2659.1 |
| Greece | 2643.7 |
| Czech Republic | 2603.4 |
| Ireland | 2268.4 |
| Serbia | 1808.1 |
| Hungary | 1641.8 |
| Slovenia | 1635.0 |
| Croatia | 1505.5 |
| Pakistan | 1188.1 |
| Nigeria | 741.9 |
| Uruguay | 735.7 |
| Romania | 732.9 |
| Estonia | 589.6 |
| Slovakia | 498.1 |
| Jordan | 497.1 |
| Lithuania | 485.9 |
| United Arab Emirates | 415.1 |
| Oman | 404.2 |
| Lebanon | 367.5 |
| Ethiopia | 343.1 |
| Uganda | 335.7 |
| Ghana | 330.2 |
| Kuwait | 312.7 |
| Tunisia | 294.8 |
| Iceland | 242.7 |
| Qatar | 201.9 |
| Luxembourg | 200.6 |
| Viet Nam | 136.0 |
| Cyprus | 135.6 |
| Morocco | 129.3 |
| Algeria | 38.0 |
# average apc: mean APC per country, weighted by fractional authorships
average_apc <- works %>%
  # first get rid of duplicates from concepts
  distinct(country, country_code, id, work_frac, author_position, institution_id,
           APC_in_dollar) %>%
  group_by(country) %>%
  # compute the average APC using fractional authorships as weights
  mutate(sum_frac = sum(work_frac)) %>%
  group_by(country, sum_frac) %>%
  # `.groups = "drop"` silences the grouped-output message and returns an
  # ungrouped table (downstream code only joins/selects on it)
  summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac,
            .groups = "drop") %>%
  collect()
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# average APC over time: weighted mean APC per country and publication year
average_apc_time <- works %>%
  # first get rid of duplicates from concepts
  distinct(country, country_code, id, work_frac, author_position, institution_id,
           APC_in_dollar, publication_year) %>%
  group_by(country, country_code, publication_year) %>%
  # compute the average APC using fractional authorships as weights
  mutate(sum_frac = sum(work_frac)) %>%
  group_by(country, country_code, sum_frac, publication_year) %>%
  # drop the grouping explicitly to avoid the noisy "`summarise()` has
  # grouped output" message
  summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac,
            .groups = "drop") %>%
  collect()
## `summarise()` has grouped output by 'country', 'country_code', 'sum_frac'. You
## can override using the `.groups` argument.
# Mean APC over time, one thin line per country, faceted by world region
# (region comes from the WDI lookup table).
average_apc_time %>%
left_join(wdi, by = c("country_code" = "iso2c")) %>%
ggplot(aes(publication_year, mean_apc)) +
# individual countries in the background
geom_line(aes(group = country), alpha = .3) +
# overall smoothed trend per facet
geom_smooth(se = FALSE, colour = "#007FA8") +
facet_wrap(vars(region)) +
scale_x_continuous(breaks = scales::pretty_breaks(6)) +
# zoom the y-axis without dropping data points from the smoother
coord_cartesian(ylim = c(0, 3000)) +
labs(x = NULL, y = "Mean APC")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
All three country-level summaries joined into one table:
# Combine the three country-level summaries into a single overview table.
overview <- universities_per_country %>%
  left_join(papers_per_country, by = "country") %>%
  left_join(average_apc, by = "country")

overview %>%
  arrange(desc(n_universities), desc(sum_fractional_works)) %>%
  mutate(mean_apc = round(mean_apc, 1)) %>%
  select(Country = country, `n universities` = n_universities,
         `n fractional publications` = sum_fractional_works,
         `Mean APC` = mean_apc) %>%
  knitr::kable()
| Country | n universities | n fractional publications | Mean APC |
|---|---|---|---|
| China | 215 | 97179.9 | 1860.1 |
| United States | 199 | 132628.9 | 1924.8 |
| United Kingdom | 60 | 38102.9 | 1813.0 |
| Japan | 55 | 20089.4 | 1807.2 |
| Germany | 54 | 30103.2 | 1801.3 |
| South Korea | 46 | 20156.7 | 1705.1 |
| Spain | 42 | 34201.0 | 675.1 |
| Italy | 41 | 18318.3 | 1688.0 |
| India | 38 | 8771.8 | 952.2 |
| Iran | 36 | 8511.9 | 774.7 |
| Australia | 32 | 21986.6 | 1779.0 |
| Brazil | 31 | 103447.9 | 251.7 |
| Poland | 31 | 14124.0 | 836.3 |
| Turkey | 31 | 6554.0 | 816.8 |
| Canada | 30 | 27258.0 | 1691.5 |
| France | 28 | 7881.0 | 1578.3 |
| Taiwan | 21 | 9195.1 | 1841.0 |
| Netherlands | 13 | 11053.1 | 1816.9 |
| Sweden | 12 | 10646.7 | 1835.1 |
| Austria | 12 | 4549.8 | 1690.3 |
| Russia | 10 | 4289.4 | 467.6 |
| South Africa | 9 | 10701.6 | 939.8 |
| Switzerland | 8 | 7804.5 | 2006.9 |
| Belgium | 8 | 6295.9 | 1707.9 |
| Israel | 8 | 4829.3 | 2009.7 |
| Egypt | 8 | 3348.2 | 925.9 |
| Greece | 8 | 2643.7 | 1496.4 |
| Finland | 7 | 3449.6 | 1570.5 |
| New Zealand | 7 | 2945.4 | 1672.9 |
| Czech Republic | 7 | 2603.4 | 1045.5 |
| Mexico | 6 | 6637.2 | 682.3 |
| Portugal | 6 | 5963.7 | 778.8 |
| Malaysia | 6 | 5107.1 | 1112.9 |
| Norway | 6 | 4701.5 | 1564.4 |
| Thailand | 6 | 2659.1 | 1521.5 |
| Ireland | 6 | 2268.4 | 1781.7 |
| Hungary | 6 | 1641.8 | 1415.6 |
| Denmark | 5 | 5321.0 | 1740.6 |
| Saudi Arabia | 5 | 4037.6 | 1432.9 |
| Pakistan | 5 | 1188.1 | 1040.9 |
| Colombia | 3 | 4403.0 | 237.3 |
| Argentina | 3 | 4039.5 | 294.2 |
| Chile | 3 | 3713.8 | 564.0 |
| Singapore | 3 | 2847.1 | 1999.8 |
| Serbia | 3 | 1808.1 | 568.3 |
| Romania | 3 | 732.9 | 881.5 |
| Tunisia | 3 | 294.8 | 1194.7 |
| Slovenia | 2 | 1635.0 | 856.7 |
| Nigeria | 2 | 741.9 | 1118.3 |
| Slovakia | 2 | 498.1 | 653.1 |
| Jordan | 2 | 497.1 | 1309.1 |
| United Arab Emirates | 2 | 415.1 | 1776.7 |
| Croatia | 1 | 1505.5 | 385.3 |
| Uruguay | 1 | 735.7 | 382.9 |
| Estonia | 1 | 589.6 | 1005.0 |
| Lithuania | 1 | 485.9 | 793.8 |
| Oman | 1 | 404.2 | 527.5 |
| Lebanon | 1 | 367.5 | 1694.5 |
| Ethiopia | 1 | 343.1 | 1664.7 |
| Uganda | 1 | 335.7 | 1784.9 |
| Ghana | 1 | 330.2 | 1564.3 |
| Kuwait | 1 | 312.7 | 1634.9 |
| Iceland | 1 | 242.7 | 1425.6 |
| Qatar | 1 | 201.9 | 1542.5 |
| Luxembourg | 1 | 200.6 | 1751.8 |
| Viet Nam | 1 | 136.0 | 1314.0 |
| Cyprus | 1 | 135.6 | 1552.3 |
| Morocco | 1 | 129.3 | 882.8 |
| Algeria | 1 | 38.0 | 732.7 |
# needs to be rechecked once the figures are correct!!!!!!
# Aggregate fractional publications by world region and build the bar
# labels ("count (percent)").
plot_data <- papers_per_country %>%
  left_join(wdi, by = c("country_code" = "iso2c")) %>%
  count(region, wt = sum_fractional_works, name = "sum_fractional_works") %>%
  mutate(prop = sum_fractional_works / sum(sum_fractional_works)) %>%
  mutate(label = glue::glue(
    "{scales::comma(sum_fractional_works)} ({scales::percent(prop, accuracy = .1)})"
  ))
# Horizontal bar chart of fractional publications per region, ordered by
# share, with "count (percent)" labels to the right of each bar.
plot_data %>%
ggplot(aes(sum_fractional_works, fct_reorder(region, prop))) +
geom_col(width = .5) +
geom_text(aes(label = label), nudge_x = 2000, hjust = "left") +
# extra right-hand expansion so the labels are not clipped
scale_x_continuous(expand = expansion(mult = c(0.05, .25)),
labels = scales::comma) +
labs(x = "Number of fractional publications", y = NULL) +
# blank all grid lines first, then re-enable only the vertical major grid
# (order of these two theme settings matters)
theme(panel.grid = element_blank(),
panel.grid.major.x = element_line(colour = "grey92"))
How many papers do we have that also have a topic assigned?
# Count unique works that have at least one field assigned; sdf_nrow()
# evaluates the row count on the Spark side.
works %>%
filter(!is.na(field)) %>%
distinct(id) %>%
sdf_nrow()
## [1] 1633554
This is our total sample size.
Which topics are represented in our sample?
# Fractional paper counts per field: each work contributes its
# concept_frac weight to every field it is tagged with.
frac_concept_papers <- works %>%
distinct(id, field, concept_frac) %>%
group_by(field) %>%
summarise(frac_papers = sum(concept_frac)) %>%
arrange(desc(frac_papers)) %>%
collect()
# Share of fractional papers per field plus a "count (percent)" label;
# rows without a field are removed first.
plot_data <- frac_concept_papers %>%
  drop_na() %>%
  mutate(prop = frac_papers / sum(frac_papers)) %>%
  mutate(label = glue::glue(
    "{scales::comma(frac_papers)} ({scales::percent(prop, accuracy = .1)})"
  ))
# Horizontal bar chart of fractional publications per field, ordered by
# share, labelled with "count (percent)".
plot_data %>%
ggplot(aes(frac_papers, fct_reorder(field, prop))) +
geom_col(width = .5) +
geom_text(aes(label = label), nudge_x = 5000, hjust = "left") +
# extra right-hand expansion so the labels are not clipped
scale_x_continuous(expand = expansion(mult = c(0.05, .25)),
labels = scales::comma) +
labs(x = "Number of fractional publications", y = NULL) +
# blank all grid lines, then re-enable only the vertical major grid
theme(panel.grid = element_blank(),
panel.grid.major.x = element_line(colour = "grey92"))
# Weighted mean APC per university, publication year, country and P_top10,
# split by author position.
#
# @param df A (Spark) data frame of works containing the columns listed in
#   distinct() below. Weights are fractional authorships (work_frac).
# @return One row per University/publication_year/country/P_top10/
#   author_position with the weighted mean APC and the fractional work count.
get_mean_apc_by_author_position <- function(df) {
  df %>%
    # first get rid of duplicates from concepts
    distinct(id, author_position, work_frac, APC_in_dollar, University, country,
             publication_year, P_top10) %>%
    group_by(University, publication_year, country, P_top10) %>%
    # compute the average APC using fractional authorships as weights
    mutate(sum_frac = sum(work_frac)) %>%
    group_by(University, publication_year, country, P_top10, sum_frac,
             author_position) %>%
    # `.groups = "drop"` removes the residual grouping and silences the
    # "`summarise()` has grouped output" message; callers re-group anyway
    summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac,
              fractional_works = sum(work_frac),
              .groups = "drop")
}
# Weighted APCs for the final year of the observation period.
mean_apcs <- works %>%
  filter(publication_year == last_year_of_period) %>%
  get_mean_apc_by_author_position()
# Pull the (small) summary table from Spark into the R session.
mean_apcs_local <- collect(mean_apcs)
## `summarise()` has grouped output by 'University', 'publication_year', 'country',
## 'P_top10', 'sum_frac'. You can override using the `.groups` argument.
# Same summary, restricted to the 2016-2019 period.
mean_apc_16_19 <- works %>%
  filter(first_year_of_period == 2016) %>%
  get_mean_apc_by_author_position()
# Materialise the summary locally.
mean_apc_16_19_local <- collect(mean_apc_16_19)
## `summarise()` has grouped output by 'University', 'publication_year', 'country',
## 'P_top10', 'sum_frac'. You can override using the `.groups` argument.
# plot for 2016-19
# taking out the correlation, because they are incorrect given that the figure
# shows a non-linear relationship (x-axis logged), but the correlation is linear
# (and quite unsuitable to the skewed P_top10)
mean_apc_16_19_local %>%
  mutate(author_position = recode(author_position, first = "First authors",
                                  last = "Last authors")) %>%
  ggplot(aes(P_top10, mean_apc, colour = fractional_works)) +
  # redundant empty aes() removed from geom_point()
  geom_point(alpha = .5) +
  scale_colour_continuous_sequential(palette = "Mako", trans = "log10") +
  geom_smooth(colour = "grey30") +
  facet_wrap(vars(author_position)) +
  scale_x_log10() +
  scale_y_continuous(labels = dollar) +
  labs(caption = "Fractional counting; 2016-2019", y = "Mean APC",
       colour = "Number of papers per institution",
       x = expression(P["top 10%"])) +
  theme(legend.position = "top",
        legend.key.width = unit(1, 'cm'))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Mean APC over time by P_top10 quartile and author position.
# cut_quartiles() is a project helper; quartiles are computed within each
# publication_year/author_position group.
mean_apcs_local %>%
group_by(publication_year, author_position) %>%
mutate(ptop10_quantiles = cut_quartiles(P_top10)) %>%
group_by(ptop10_quantiles, publication_year, author_position) %>%
# weighted mean across institutions, using fractional authorships as weights
summarise(mean_apc = weighted.mean(mean_apc, sum_frac, na.rm = TRUE),
.groups = "drop_last") %>%
ggplot(aes(publication_year, mean_apc, colour = ptop10_quantiles,
group = ptop10_quantiles)) +
geom_line() +
facet_wrap(vars(author_position)) +
scale_x_continuous(breaks = seq(2010, 2018, by = 4)) +
scale_y_continuous(labels = dollar) +
scale_color_discrete_qualitative(palette = "Dark 3") +
theme(legend.position = "top") +
labs(caption = "Fractional counting", y = "Mean APC",
colour = expression(P["top 10%"]), x = NULL)
# Weighted mean APC per university, publication year, P_top10 and field.
#
# @param df A (Spark) data frame of works containing the columns listed in
#   distinct() below. Weights are fractional authorships (work_frac).
# @return One row per University/publication_year/P_top10/author_position/
#   field with the weighted mean APC.
get_mean_apc_by_concept <- function(df) {
  df %>%
    distinct(id, University, publication_year, P_top10, field, work_frac,
             APC_in_dollar, author_position) %>%
    group_by(University, publication_year, P_top10, field) %>%
    # spark is unhappy for some reason, so coerce again to numeric
    mutate(work_frac = as.numeric(work_frac)) %>%
    # compute the average APC using fractional authorships as weights
    mutate(sum_frac = sum(work_frac)) %>%
    group_by(University, publication_year, P_top10, sum_frac,
             author_position, field) %>%
    # `.groups = "drop"` for consistency with
    # get_mean_apc_by_author_position(); silences the grouped-output message
    summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac,
              .groups = "drop")
}
# Field-level weighted APCs for the final year of the period.
mean_apc_concept <- works %>%
  filter(publication_year == last_year_of_period) %>%
  get_mean_apc_by_concept()
# Bring the summary into local memory.
mean_apc_concept_local <- collect(mean_apc_concept)
## `summarise()` has grouped output by 'University', 'publication_year', 'P_top10',
## 'sum_frac', 'author_position'. You can override using the `.groups` argument.
# Field-level weighted APCs for the 2016-2019 period.
mean_apc_concept_16_19 <- works %>%
  filter(first_year_of_period == 2016) %>%
  get_mean_apc_by_concept()
# Materialise locally for plotting.
mean_apc_concept_16_19_local <- collect(mean_apc_concept_16_19)
## `summarise()` has grouped output by 'University', 'publication_year', 'P_top10',
## 'sum_frac', 'author_position'. You can override using the `.groups` argument.
# plot for 2016-2019
# Smoothed mean APC against P_top10 (logged x-axis), one curve per field,
# faceted by author position. Stored in `p` for re-use below.
p <- mean_apc_concept_16_19_local %>%
drop_na(field) %>%
ggplot(aes(P_top10, mean_apc, colour = field)) +
geom_smooth(alpha = .15) +
facet_wrap(vars(author_position), nrow = 1) +
scale_x_log10() +
scale_y_continuous(labels = dollar) +
labs(caption = "2016-2019", y = "Mean APC",
x = expression(P["top 10%"]))
p
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Interactive (plotly) version of the same figure
plotly::ggplotly(p)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Using ggrepel
# Fields to highlight in the following figure.
highlight_fields <- c("Medicine", "History", "Sociology", "Biology",
                      "Materials science", "Physics")
field_selection <- mean_apc_concept_16_19_local %>%
  drop_na(field) %>%
  filter(field %in% highlight_fields)
# Grey background smooths for the non-highlighted fields, coloured smooths
# for the selected fields, faceted by author position.
mean_apc_concept_16_19_local %>%
  # anti_join() without `by` matched on every shared column and printed a
  # "Joining, by = ..." message; the selection is defined purely by `field`,
  # so join on that key explicitly
  anti_join(field_selection, by = "field") %>%
  ggplot(aes(P_top10, mean_apc, group = field)) +
  geom_smooth(alpha = .3, colour = "grey80", fill = "grey90") +
  geom_smooth(aes(colour = field), data = field_selection) +
  facet_wrap(vars(author_position), nrow = 1) +
  scale_x_log10() +
  scale_y_continuous(labels = dollar) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  labs(caption = "2016-2019", y = "Mean APC", colour = NULL,
       x = expression(P["top 10%"])) +
  # theme(legend.position = "top") +
  guides(colour = guide_legend(override.aes = list(alpha = 0)))
## Joining, by = c("University", "publication_year", "P_top10", "sum_frac",
## "author_position", "field", "mean_apc")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Country-level weighted mean APC for the 2016-2019 period.
mean_apc_country_16_19 <- works %>%
  filter(first_year_of_period == 2016) %>%
  # first get rid of duplicates from concepts
  distinct(id, work_frac, APC_in_dollar, University, country, P_top10, country_code) %>%
  # spark is unhappy for some reason, so coerce again to numeric
  mutate(work_frac = as.numeric(work_frac)) %>%
  group_by(University, country, P_top10) %>%
  # compute the average APC using fractional authorships as weights
  mutate(sum_frac = sum(work_frac)) %>%
  group_by(University, country, P_top10, sum_frac, country_code) %>%
  # `.groups = "drop"` silences the grouped-output message; the result is
  # collected and re-joined below, so no grouping is needed
  summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac,
            .groups = "drop")
mean_apc_country_16_19_local <- mean_apc_country_16_19 %>%
  collect()
## `summarise()` has grouped output by 'University', 'country', 'P_top10',
## 'sum_frac'. You can override using the `.groups` argument.
# Attach World Bank metadata (e.g. region) via the ISO2 country code.
mean_apc_country_16_19_local <- left_join(
  mean_apc_country_16_19_local, wdi, by = c("country_code" = "iso2c")
)
# Mean APC against P_top10 (logged), coloured by world region, with one
# point per institution and a smoothed trend per region.
mean_apc_country_16_19_local %>%
ggplot(aes(P_top10, mean_apc, colour = region)) +
geom_point(alpha = .3, size = 1.2) +
geom_smooth(alpha = .3) +
scale_x_log10() +
scale_y_continuous(labels = dollar) +
scale_color_discrete_qualitative(palette = "Dark 3") +
labs(y = "Mean APC", x = expression(P["top 10%"]), colour = NULL) +
theme(legend.position = "top") +
# show fully opaque keys in the legend despite the transparent points
guides(colour = guide_legend(override.aes = list(alpha = 0)))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Close the Spark connection now that all summaries are collected locally
spark_disconnect(sc)